/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */
/*
* tbuild.c: Reads a file = list of indices and hashes frequently used
* words into a hash-table for fast retrieval of a word's number.
* Also maintains a string-table to get back from a number to its word.
*/
#include "defs.h"
/* Small tables are OK here since this computation is a one-time job; we also need the extra space if we put it in a library for glimpseindex. */
extern hash_entry *get_small_hash(), *insert_small_hash();
hash_entry *dict_hash_table[SMALL_HASH_TABLE_SIZE];
hash_entry *freq_strings_table[MAX_WORD_LEN+2];
int freq_strings_num[MAX_WORD_LEN+2];
extern int MAX_WORDS;
extern int RESERVED_CHARS;
/*
* Computes the dictionary for the compression routines, which need two things:
* 1. A hash-table which maps words to word numbers.
* 2. A string-table which maps word numbers to offsets where the word begins
* in the input file to this program (default DEF_INDEX_FILE).
* They are written out in two files, hash_file and string_file (default).
*
* Algorithm: first build the hash table of at most 65536 words. Then traverse
* the table and output the hash/string files in the above format.
*/
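/*
* Record format assumed by the loop below (inferred from the parsing
* code, not from separate documentation): each line of DEF_INDEX_FILE is
*
*     word WORD_END_MARK freq1 freq2 ... freqN \n
*
* where the freqs are space-separated decimal counts that get summed
* into the word's total frequency. Lines beginning with '%' are header
* lines and are skipped.
*/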
int
compute_dictionary(THRESHOLD, FILEBLOCKSIZE, SPECIAL_WORDS, comp_dir)
int THRESHOLD, FILEBLOCKSIZE, SPECIAL_WORDS;
char comp_dir[];
{
int c;
unsigned char curline[MAX_NAME_LEN];
int curchar = 0;
unsigned char curword[MAX_NAME_LEN];
int curfreq;
int curoffset = 0;
int curlen;
int wordindex = 0;
FILE *fp, *freqfp, *tempfp, *awkfp;
int pid = getpid();
int i;
hash_entry *e, **t, *p;
int dummy, offset, len;
unsigned char s[MAX_LINE_LEN];
unsigned char rands[MAX_LINE_LEN];
char index_file[MAX_LINE_LEN], string_file[MAX_LINE_LEN], hash_file[MAX_LINE_LEN], freq_file[MAX_LINE_LEN];
strcpy(hash_file, comp_dir);
strcat(hash_file, "/");
strcat(hash_file, DEF_HASH_FILE);
strcpy(freq_file, comp_dir);
strcat(freq_file, "/");
strcat(freq_file, DEF_FREQ_FILE);
strcpy(string_file, comp_dir);
strcat(string_file, "/");
strcat(string_file, DEF_STRING_FILE);
strcpy(index_file, comp_dir);
strcat(index_file, "/");
strcat(index_file, DEF_INDEX_FILE);
memset(dict_hash_table, '\0', sizeof(hash_entry *) * SMALL_HASH_TABLE_SIZE);
memset(freq_strings_table, '\0', sizeof(hash_entry *) * (MAX_WORD_LEN+2));
memset(freq_strings_num, '\0', sizeof(int) * (MAX_WORD_LEN+2));
if (THRESHOLD < 0) THRESHOLD = DEF_THRESHOLD;
if (SPECIAL_WORDS < 0) SPECIAL_WORDS = DEF_SPECIAL_WORDS;
if (FILEBLOCKSIZE < 0) FILEBLOCKSIZE = DEF_BLOCKSIZE;
if (THRESHOLD < MIN_WORD_LEN || THRESHOLD > MAX_THRESHOLD) {
fprintf(stderr, "threshold must be in [%d, %d]\n", MIN_WORD_LEN, MAX_THRESHOLD);
return -1;
}
if ((SPECIAL_WORDS < 0) || (SPECIAL_WORDS > 256-MAX_SPECIAL_CHARS)) {
fprintf(stderr, "invalid special words %d: must be in [0, %d]\n", SPECIAL_WORDS, 256-MAX_SPECIAL_CHARS);
return -1;
}
RESERVED_CHARS = SPECIAL_WORDS + MAX_SPECIAL_CHARS;
MAX_WORDS = MAX_LSB*(256-RESERVED_CHARS);
if ((fp = fopen(index_file, "r")) == NULL) {
fprintf(stderr, "cannot open for reading: %s\n", index_file);
return -1;
}
sprintf(s, "/tmp/temp%d", pid);
if ((tempfp = fopen(s, "w")) == NULL) {
fprintf(stderr, "cannot open for writing: %s\n", s);
fclose(fp);
return -1;
}
while((c = getc(fp)) != EOF) {
if (curchar >= MAX_NAME_LEN) {
curchar = 0;
while(((c = getc(fp)) != '\n') && (c != EOF));	/* discard the rest of the over-long line */
if (c == EOF) break;
else continue;
}
curline[curchar++] = (unsigned char)c;
if (c == '\n') { /* reached end of record */
int i = 0;
if (curline[0] == '%') { /* initial lines */
curchar = 0;
continue;
}
curword[0] = '\0';
while ((i<curchar) && (curline[i] != WORD_END_MARK)) {
curword[i] = curline[i]; i++;
}
curword[i++] = '\0'; /* skip over WORD_END_MARK */
curlen = strlen((char *)curword);
curfreq = 0;
while(i<curchar) {
unsigned char tempbuf[MAX_NAME_LEN];
int j = 0;
while(isdigit(curline[i])) tempbuf[j++] = curline[i++];
tempbuf[j] = '\0';
curfreq += atoi((char *)tempbuf);
i ++; /* skip over current ' ' or '\n' */
}
#if 0
printf("curlen=%d curfreq=%d\n", curlen, curfreq);
#endif /*0*/
if ((curlen >= MIN_WORD_LEN) && (curlen*curfreq >= THRESHOLD)) {
fprintf(tempfp, "%d %d %s\n", curlen*curfreq, curoffset, curword);
wordindex ++;
}
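/* The weight curlen*curfreq presumably estimates the byte savings from
coding this word: its length times the number of times it occurs. */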
curoffset += curchar; /* offset cannot begin at 0: .index_list starts with %% and some junk */
curchar = 0;
#if 0
printf("word=%s freq=%d\n", curword, curfreq);
#endif /*0*/
}
}
fclose(fp);
/*
* Now choose the MAX_WORDS words with the highest length*frequency weight.
*/
fflush(tempfp);
fclose(tempfp);
if (wordindex <= SPECIAL_WORDS) {
fprintf(stderr, "Warning: very small dictionary with only %d words!\n", wordindex);
}
sprintf(s, "sort -n -r /tmp/temp%d > /tmp/sort%d\n", pid, pid);
system(s);
sprintf(s, "rm /tmp/temp%d\n", pid);
system(s);
sprintf(s, "head -%d /tmp/sort%d > /tmp/temp%d\n", MAX_WORDS, pid, pid);
system(s);
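/*
* At this point /tmp/temp<pid> holds one "weight offset word" line per
* surviving word; sort -n -r orders them by descending weight, and head
* keeps only the MAX_WORDS best candidates.
*/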
/*
* The SPECIAL_WORDS most frequent words are stored in a separate table
* sorted by length and, within each length, alphabetically (= canonical
* order).
*/
sprintf(s, "/tmp/temp%d", pid);
if ((tempfp = fopen(s, "r")) == NULL) {
fprintf(stderr, "cannot open for reading: %s\n", s);
return -1;
}
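/*
* Each special word is insertion-sorted into freq_strings_table[len],
* the chain of words of its length, which keeps every chain alphabetical.
*/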
for (i=0; i<SPECIAL_WORDS; i++) {
if (3 != fscanf(tempfp, "%d %d %s\n", &dummy, &offset, s))
break;
len = strlen((char *)s);
if (len > MAX_WORD_LEN) continue;	/* defensive: the dump loop below only visits lengths up to MAX_WORD_LEN */
e = (hash_entry *)malloc(sizeof(hash_entry));
e->word = (char *)malloc(len + 2);
e->next = NULL;
strcpy(e->word, (char *)s);
/* I'm not worried about the offsets now */
t = &freq_strings_table[len];
while(*t != NULL) {
if (strcmp((char *)s, (*t)->word) < 0) {
e->next = *t;
break;
}
t = &(*t)->next;
}
*t = e; /* valid in both cases */
freq_strings_num[len]++;
}
/*
* Put all the other words in the hash/string tables
*/
for (; i<MAX_WORDS; i++) {
if (3 != fscanf(tempfp, "%d %d %s", &dummy, &offset, s)) break;
insert_small_hash(dict_hash_table, s, strlen((char *)s), -1, offset); /* dummy doesn't matter now: it is just the weight that was computed for sort */
}
fclose(tempfp);
/*
* At this point, the hash-table and freq-words-table have been computed properly: dump them
*/
if ((freqfp = fopen(freq_file, "w")) == NULL) {
fprintf(stderr, "cannot open for writing: %s\n", freq_file);
return -1;
}
/* random number signature for this set of dictionaries will be in the freq-file (.glimpse_quick) */
srand(getpid());
rands[0] = '\0';
while (strlen((char*)rands) < SIGNATURE_LEN - 1) {
sprintf(s, "%x", rand());
strcat((char *)rands, (char *)s);
}
fwrite(rands, 1, SIGNATURE_LEN - 1, freqfp);
fputc('\n', freqfp);
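/* The signature is SIGNATURE_LEN-1 hex digits plus a newline; the
compressor presumably records the same string so that decompression can
verify a file was built against this exact dictionary. */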
fprintf(freqfp, "%d\n", SPECIAL_WORDS);
for (i=0; i<=MAX_WORD_LEN; i++) {
if (freq_strings_num[i] <= 0) continue;
fprintf(freqfp, "%d %d\n", i, freq_strings_num[i]);
e = freq_strings_table[i];
while (e!=NULL) {
fprintf(freqfp, "%s\n", e->word);
p = e;
e = e->next;
free(p->word);
free(p);
}
}
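/*
* Resulting freq-file layout: the signature line, then the SPECIAL_WORDS
* count, then for each word length a "len count" line followed by that
* bucket's words in alphabetical order.
*/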
fflush(freqfp);
fclose(freqfp);
if (!dump_small_hash(dict_hash_table, hash_file)) return -1;
/*
* Now sort the chosen words case-insensitively by name, so that words
* differing only in case fall into the same page offset in the
* hash/string tables.
*/
/* Alter order of words in .hash_table */
sprintf(s, "/tmp/sort%d.a", pid);
if ((awkfp = fopen(s, "w")) == NULL) {
fprintf(stderr, "cannot open for writing: %s\n", s);
return -1;
}
sprintf(s, "BEGIN {}\n{print $3 \" \" $2 \" \" $1}\nEND {}\n");
fwrite(s, 1, strlen((char *)s), awkfp);
fflush(awkfp);
fclose(awkfp);
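/*
* The script above rewrites each hash-file line from "$1 $2 $3" to
* "$3 $2 $1", so the following sort -d -f (dictionary order, folding
* case) keys on what was the third field -- presumably the word itself.
*/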
#if 0
sprintf(s, "cat /tmp/sort%d.a\n", pid);
system(s);
#endif /*0*/
#if 0
printf("stage1:");
getchar();
#endif /*0*/
sprintf(s, "awk -f /tmp/sort%d.a < %s > /tmp/sort%d\n", pid, hash_file, pid);
system(s);
sprintf(s, "sort -d -f /tmp/sort%d > /tmp/temp%d\n", pid, pid);
system(s);
sprintf(s, "/tmp/sort%d.a", pid);
if ((awkfp = fopen(s, "w")) == NULL) {
fprintf(stderr, "cannot open for writing: %s\n", s);
return -1;
}
sprintf(s, "%s", "BEGIN {}\n{print $3 \" \" NR-1 \" \" $1}\nEND {}\n");
fwrite(s, 1, strlen((char *)s), awkfp);
fflush(awkfp);
fclose(awkfp);
sprintf(s, "awk -f /tmp/sort%d.a < /tmp/temp%d > %s\n", pid, pid, hash_file); /* reorder and put in new word numbers */
system(s);
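/*
* The second script swaps the outer fields back and replaces the middle
* field with NR-1: each word's new number is simply its 0-based position
* in the case-folded sort order.
*/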
#if 0
printf("stage2:");
getchar();
#endif /*0*/
/* Now extract the string-table, which is the third field ($3) of each hash-table line */
sprintf(s, "/tmp/sort%d.a", pid);
if ((awkfp = fopen(s, "w")) == NULL) {
fprintf(stderr, "cannot open for writing: %s\n", s);
return -1;
}
sprintf(s, "%s", "BEGIN {}\n{print $3}\nEND {}\n");
fwrite(s, 1, strlen((char *)s), awkfp);
fflush(awkfp);
fclose(awkfp);
#if 0
sprintf(s, "cat /tmp/sort%d.a\n", pid);
system(s);
#endif /*0*/
sprintf(s, "awk -f /tmp/sort%d.a < %s > %s\n", pid, hash_file, string_file);
system(s);
#if 0
printf("stage3:"); getchar();
#endif /*0*/
/*
* Now pad the hash and string files and store indices to words at
* page boundaries, so that search() on compressed files can be made
* fast -- it need not load the whole hash-table, just the page where
* the word occurs. The index files are very small (< 1K), so reading
* them is fast. The padded files are in binary -- this is what
* tcompress/tuncompress read in. This is done to save space.
*/
pad_hash_file(hash_file, FILEBLOCKSIZE);
pad_string_file(string_file, FILEBLOCKSIZE);
return 0;
}